Set-up

library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qc
library(org.Mm.eg.db) # To annotate the genenames
sce <- readRDS(here("processed", "sce.RDS"))

Gene QC

We start by removing genes not expressed in any cell

genes_beforeqc <- dim(sce)[1]
keep_feature <- rowSums(counts(sce) > 0) > 0
sce <- sce[keep_feature, ]
genes_beforeqc -dim(sce)[1] 
## [1] 6511

6511 cells are not expressed in any cell.

Add cell QC metrics to the sce

First are sorted the genenames and gene symbols, because the default ensembl notation is not very handy. And then save the mitochondrial genes as such.

if(!file.exists(here("processed", "sce_preliminary.RDS"))){
# obtain full genenames
genename <- mapIds(org.Mm.eg.db, keys = rownames(sce), keytype = "ENSEMBL", column = c("GENENAME"))
# Use the symbols as rownames
# first make gene names unique 
# TODO: save duplicate gene name list
symb_unique <- uniquifyFeatureNames(rownames(sce), rowData(sce)[, "Symbol"])
# Now they can be used as rownames
rownames(sce) <- symb_unique
# Add full gene names and the uniuqe symbols to the rowdata
rowData(sce)$symb_uniq <- symb_unique
rowData(sce)$gene_name <- genename
# Subset the mitochondrial genes
is_mito <- grepl("^mt-", rownames(sce))
}else{
  sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}

Then we can use the scater package to add the quality per cell ## First try log for low thresholds but not for high. In low this prevents negative thresholds

if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce <- addPerCellQC(sce, subsets = list(mt=is_mito))
# Automated outlier detection
outlier_lib_low <- isOutlier(sce$total, log = TRUE, type = "lower")
outlier_expr_low <-
  isOutlier(sce$detected, log = TRUE, type = "lower")
outlier_lib_high <- isOutlier(sce$total, type = "higher")
outlier_expr_high <-
  isOutlier(sce$detected, type = "higher")
outlier_mt <- isOutlier(sce$subsets_mt_percent, type = "higher")
# total
outlier <-
  outlier_lib_low |
  outlier_expr_low |
  outlier_lib_high | outlier_expr_high | outlier_mt
# See how many cells are detected as outliers
DataFrame(
  lib_size_high = sum(outlier_lib_high),
  expression_high = sum(outlier_expr_high),
  lib_size_low = sum(outlier_lib_low),
  expression_low = sum(outlier_expr_low),
  mt_pct = sum(outlier_mt),
  total = sum(outlier)
)
# And the thresholds
attr(outlier_lib_high, "thresholds")
attr(outlier_expr_high, "thresholds")
attr(outlier_lib_low, "thresholds")
attr(outlier_expr_low, "thresholds")
attr(outlier_mt, "thresholds")
# TODO do the analysis separating by batch?
# Add if it is an outlier to the metadata
sce$outlier <- outlier

}else{
  sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}

Plots before QC

Diagnostic plots to visualize the data distribution. The orange cells are marked as outliers by the automatic detection from scater. The thresholds are not very useful, so these will poblably be ignored.

Violin plots

plotColData(sce, x="Sample", y="sum", colour_by="outlier") + 
  ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="sum", colour_by="outlier") + 
  scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="detected", colour_by="outlier") + 
  scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="sum", colour_by="chip") + 
  scale_y_log10() + ggtitle("total count by batch") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="subsets_mt_percent", colour_by="outlier") + ggtitle("Mitocchondrial percentatge") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Histograms

hist(
    sce$total,
    breaks = 100
)

hist(
    sce$detected,
    breaks = 100
)

hist(
    sce$subsets_mt_percent,
    breaks = 100
)

### Scatter plots

plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="outlier")

plotColData(sce, x="sum", y="detected", colour_by="outlier")

plotColData(sce, x="sum", y="detected", colour_by="Sample")

PCA

Here we run a PCA using the information in the metadata instead of the gene expression. It is useful to visualize the QC parametres.

sce <- runColDataPCA(sce,  variables = c("sum","detected", "subsets_mt_percent"))
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "Sample")

sce$chip <- as.character(sce$chip)
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "chip")

Ratio between sum and gene counts

This measures will probably very useful to get rid of these cells that have low gene counts but a relatively high umi count. The isoutlier function did not work very well with the sum of umi and the detected genes. This function expects roughly a normal distribution. Bellow we try with the ratio between these two parametres, that do follow a normal distribution.

if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce$ratio_detected_sum <- sce$detected/sce$sum
sce$outlier_ratio <- isOutlier(sce$ratio_detected_sum, type = "both")
# Save the object at this stage with the label "preliminary analysis" 
# As only annotation addition has been done, without deleting anything yet.
saveRDS(sce, here("processed", "sce_preliminary.RDS"))
}else{
  sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}

summary(sce$ratio_detected_sum)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005707 0.326323 0.395047 0.404313 0.476884 0.878004
plotColData(sce, x="sum", y="detected", colour_by="ratio_detected_sum")

# Plot an histogram with the ratio between umi and gene counts
hist(
    sce$ratio_detected_sum,
    breaks = 100
)

# Use the is outlier function from scater to see the cutoffs suggestions
plotColData(sce, x="sum", y="detected", colour_by="outlier_ratio")

attr(sce$outlier_ratio, "thresholds")
##     lower    higher 
## 0.0649170 0.7251773

The suggestions here are very reasonable, it is not enough to delete all the cells we do not trust but we can definitely take this into consideration as a threshold parametre.

RE-do same plots as above with the ratio outlier marked

Violin plots

plotColData(sce, x="Sample", y="sum", colour_by="outlier_ratio") + 
  ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="sum", colour_by="outlier") + 
  scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="detected", colour_by="outlier_ratio") + 
  scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Scatter plots

plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="outlier_ratio")

plotColData(sce, x="sum", y="detected", colour_by="outlier_ratio")

PCA

Here we run a PCA using the information in the metadata instead of the gene expression. It is useful to visualize the QC parametres.

plotReducedDim(sce, dimred="PCA_coldata", colour_by = "outlier_ratio")